purrr for quick summary tables - St Koo
mutate, across and case_when - Anna Quaglieri
Adapted from this fantastic Learn to Purrr tutorial
# Load example data
data("mtcars")
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
You can use purrr to loop over all the columns, and output the info into a dataframe.
In this example, I want to see variable type, max character length, and missingness.
library(tidyverse) # Includes purrr
# Create dataframe with some summary info
# Column-level summary of a data frame: for each column record its
# class, the widest value when printed, and the number of missing
# entries. map_df() row-binds the one-row data.frames and `.id`
# stores the column name the row came from.
summary_df <- purrr::map_df(
  mtcars,
  function(col) {
    data.frame(
      class = class(col),
      max_char = max(nchar(col, keepNA = FALSE)),
      missing = sum(is.na(col))
    )
  },
  .id = "col_name"
)
summary_df
## col_name class max_char missing
## 1 mpg numeric 4 0
## 2 cyl numeric 1 0
## 3 disp numeric 5 0
## 4 hp numeric 3 0
## 5 drat numeric 4 0
## 6 wt numeric 5 0
## 7 qsec numeric 5 0
## 8 vs numeric 1 0
## 9 am numeric 1 0
## 10 gear numeric 1 0
## 11 carb numeric 1 0
And because it’s a dataframe, you can use a package like kableExtra to format it for reports.
library(kableExtra)
# Render the summary data frame as an HTML table styled by kableExtra
summary_df %>%
kableExtra::kbl() %>%
kableExtra::kable_styling()
| col_name | class | max_char | missing |
|---|---|---|---|
| mpg | numeric | 4 | 0 |
| cyl | numeric | 1 | 0 |
| disp | numeric | 5 | 0 |
| hp | numeric | 3 | 0 |
| drat | numeric | 4 | 0 |
| wt | numeric | 5 | 0 |
| qsec | numeric | 5 | 0 |
| vs | numeric | 1 | 0 |
| am | numeric | 1 | 0 |
| gear | numeric | 1 | 0 |
| carb | numeric | 1 | 0 |
purrr - Anna Quaglieri
For the example I am going to use the flights dataset from the R package nycflights13. I am going to fit a linear model that tries to explain arr_time as a function of dep_time and arr_delay.
library(nycflights13)
library(purrr)
# Peek at the four variables used in the per-carrier models below
flights %>%
dplyr::select(arr_time, dep_time, arr_delay, carrier) %>%
head()
## # A tibble: 6 x 4
## arr_time dep_time arr_delay carrier
## <int> <int> <dbl> <chr>
## 1 830 517 11 UA
## 2 850 533 20 UA
## 3 923 542 33 AA
## 4 1004 544 -18 B6
## 5 812 554 -25 DL
## 6 740 554 12 UA
To fit the model to the whole dataset we would use the following code:
summary(lm(arr_time ~ dep_time + arr_delay, data = flights))
##
## Call:
## lm(formula = arr_time ~ dep_time + arr_delay, data = flights)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2250.59 -68.14 50.97 169.26 2342.72
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 491.250510 2.045943 240.1 <2e-16 ***
## dep_time 0.757657 0.001446 524.1 <2e-16 ***
## arr_delay -1.633355 0.015815 -103.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 392.8 on 327343 degrees of freedom
## (9430 observations deleted due to missingness)
## Multiple R-squared: 0.4566, Adjusted R-squared: 0.4566
## F-statistic: 1.375e+05 on 2 and 327343 DF, p-value: < 2.2e-16
What if we wanted to fit separate models by carrier?
# Fit a separate linear model per carrier.
# nest(data = -carrier) collapses every column except `carrier` into a
# list-column of per-carrier tibbles (the bare `nest(-carrier)` form is
# deprecated since tidyr 1.0 and only works via an automatic rewrite).
models <- flights %>%
  dplyr::select(arr_time, dep_time, arr_delay, carrier) %>%
  tidyr::nest(data = -carrier) %>%
  # One lm() per carrier, fitted on that carrier's rows only
  dplyr::mutate(fit = purrr::map(data, ~ lm(arr_time ~ dep_time + arr_delay, data = .))) %>%
  # 95% confidence intervals for each fitted model's coefficients
  dplyr::mutate(results_fit = purrr::map(fit, function(f) confint(f)))
models
## # A tibble: 16 x 4
## carrier data fit results_fit
## <chr> <list> <list> <list>
## 1 UA <tibble [58,665 × 3]> <lm> <dbl[,2] [3 × 2]>
## 2 AA <tibble [32,729 × 3]> <lm> <dbl[,2] [3 × 2]>
## 3 B6 <tibble [54,635 × 3]> <lm> <dbl[,2] [3 × 2]>
## 4 DL <tibble [48,110 × 3]> <lm> <dbl[,2] [3 × 2]>
## 5 EV <tibble [54,173 × 3]> <lm> <dbl[,2] [3 × 2]>
## 6 MQ <tibble [26,397 × 3]> <lm> <dbl[,2] [3 × 2]>
## 7 US <tibble [20,536 × 3]> <lm> <dbl[,2] [3 × 2]>
## 8 WN <tibble [12,275 × 3]> <lm> <dbl[,2] [3 × 2]>
## 9 VX <tibble [5,162 × 3]> <lm> <dbl[,2] [3 × 2]>
## 10 FL <tibble [3,260 × 3]> <lm> <dbl[,2] [3 × 2]>
## 11 AS <tibble [714 × 3]> <lm> <dbl[,2] [3 × 2]>
## 12 9E <tibble [18,460 × 3]> <lm> <dbl[,2] [3 × 2]>
## 13 F9 <tibble [685 × 3]> <lm> <dbl[,2] [3 × 2]>
## 14 HA <tibble [342 × 3]> <lm> <dbl[,2] [3 × 2]>
## 15 YV <tibble [601 × 3]> <lm> <dbl[,2] [3 × 2]>
## 16 OO <tibble [32 × 3]> <lm> <dbl[,2] [3 × 2]>
# Expand the list-column of confidence intervals into regular columns.
# `cols = results_fit` is the tidyr >= 1.0 interface; the original
# bare-argument call with `.drop = TRUE` triggers deprecation warnings
# (`.drop` is ignored by tidyr 1.1, which is why `data` and `fit` are
# still present in the printed output below).
expand_models <- models %>%
  tidyr::unnest(cols = results_fit)
expand_models
## # A tibble: 48 x 4
## carrier data fit results_fit[,1] [,2]
## <chr> <list> <list> <dbl> <dbl>
## 1 UA <tibble [58,665 × 3]> <lm> 491. 511.
## 2 UA <tibble [58,665 × 3]> <lm> 0.757 0.771
## 3 UA <tibble [58,665 × 3]> <lm> -1.83 -1.66
## 4 AA <tibble [32,729 × 3]> <lm> 434. 455.
## 5 AA <tibble [32,729 × 3]> <lm> 0.822 0.838
## 6 AA <tibble [32,729 × 3]> <lm> -1.02 -0.852
## 7 B6 <tibble [54,635 × 3]> <lm> 843. 870.
## 8 B6 <tibble [54,635 × 3]> <lm> 0.406 0.425
## 9 B6 <tibble [54,635 × 3]> <lm> -2.66 -2.42
## 10 DL <tibble [48,110 × 3]> <lm> 418. 436.
## # … with 38 more rows
expand_models$fit[1]
## [[1]]
##
## Call:
## lm(formula = arr_time ~ dep_time + arr_delay, data = .)
##
## Coefficients:
## (Intercept) dep_time arr_delay
## 501.083 0.764 -1.746
mutate, across and case_when - Anna Quaglieri
I found the method below really useful to recode the levels of one or several columns!
library(dplyr)
# Toy data: four areas (2 North, 3 Sud, 4 East, 1 West rows) with a
# quality rating recorded before and after — 10 observations in total.
test_data <- data.frame(
  area = rep(c("North", "Sud", "East", "West"), times = c(2, 3, 4, 1)),
  quality_before = c("High", "Low", "High", "Low", "Medium",
                     "Medium", "Low", "High", "High", "Low"),
  quality_after = c("High", "High", "High", "Medium", "Medium",
                    "Low", "Low", "High", "High", "Low")
)
# Recode both quality columns from labels to a numeric scale in one go:
# Low -> 0, Medium -> 1, High -> 2 (any other value would become NA,
# since case_when has no catch-all here).
test_data %>%
  mutate(across(
    c(quality_before, quality_after),
    ~ case_when(
      .x == "Low" ~ 0,
      .x == "Medium" ~ 1,
      .x == "High" ~ 2
    )
  ))
## area quality_before quality_after
## 1 North 2 2
## 2 North 0 2
## 3 Sud 2 2
## 4 Sud 0 1
## 5 Sud 1 1
## 6 East 1 0
## 7 East 0 0
## 8 East 2 2
## 9 East 2 2
## 10 West 0 0
I strongly suggest having a look at the other functions and applications for performing column-wise operations: https://cran.r-project.org/web/packages/dplyr/vignettes/colwise.html.
sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] nycflights13_1.0.1 kableExtra_1.3.1 forcats_0.5.0 stringr_1.4.0
## [5] dplyr_1.0.0 purrr_0.3.4 readr_1.3.1 tidyr_1.1.0
## [9] tibble_3.0.4 ggplot2_3.3.2 tidyverse_1.3.0 knitr_1.30
## [13] png_0.1-7 magick_2.4.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.1.0 xfun_0.19 haven_2.3.1 lattice_0.20-41
## [5] colorspace_2.0-0 vctrs_0.3.4 generics_0.0.2 viridisLite_0.3.0
## [9] htmltools_0.5.0 yaml_2.2.1 utf8_1.1.4 blob_1.2.1
## [13] rlang_0.4.8 pillar_1.4.6 withr_2.3.0 glue_1.4.2
## [17] DBI_1.1.0 dbplyr_1.4.4 modelr_0.1.8 readxl_1.3.1
## [21] lifecycle_0.2.0 munsell_0.5.0 gtable_0.3.0 cellranger_1.1.0
## [25] rvest_0.3.5 evaluate_0.14 fansi_0.4.1 highr_0.8
## [29] broom_0.5.6 Rcpp_1.0.5 backports_1.2.0 scales_1.1.1
## [33] webshot_0.5.2 jsonlite_1.7.1 fs_1.4.2 hms_0.5.3
## [37] digest_0.6.27 stringi_1.5.3 cli_2.1.0 tools_4.0.2
## [41] magrittr_1.5 crayon_1.3.4 pkgconfig_2.0.3 ellipsis_0.3.1
## [45] xml2_1.3.2 reprex_0.3.0 lubridate_1.7.9 assertthat_0.2.1
## [49] rmarkdown_2.3 httr_1.4.2 rstudioapi_0.13 R6_2.5.0
## [53] nlme_3.1-148 compiler_4.0.2
I code in the tidyverse universe, plus tidylog to output messages describing every change made to a vector, dataframe, tibble, etc. Please find the tidylog package at https://github.com/elbersb/tidylog.
library(tidyverse)
# Quick base-R ways to inspect a data frame:
# colnames(iris)
# summary(iris)
# str(iris)
library(Hmisc)
# Hmisc::describe reports n, missingness, distinct values and
# per-level frequencies/proportions for a single variable
iris$Species %>% describe
## .
## n missing distinct
## 150 0 3
##
## Value setosa versicolor virginica
## Frequency 50 50 50
## Proportion 0.333 0.333 0.333
# Attach tidylog so each dplyr verb prints what it changed
library(tidylog, warn.conflicts = FALSE, quietly = FALSE)
# case_when with no catch-all: rows matching neither condition get NA
new_dt <- iris %>%
filter(Sepal.Length >= 4.6) %>%
mutate(new_name = case_when(
Species == "versicolor" ~ "V",
Species == "setosa" ~ "S"))
# Same pipeline, but TRUE ~ "Virginica" acts as a catch-all default,
# so no NAs are produced this time
library(tidylog, warn.conflicts = FALSE, quietly = FALSE)
new_dt <- iris %>%
filter(Sepal.Length >= 4.6) %>%
mutate(new_name = case_when(
Species == "versicolor" ~ "Versicolor",
Species == "setosa" ~ "Setosa",
TRUE ~ "Virginica"))
Use iris dataframe as an example.
# Use the first 5 rows of iris as a small demo table
dt <- head(iris,5)
# Pre-dplyr-1.0 alternative: dt %>% select("Species", everything(.))
# relocate() (dplyr >= 1.0) moves Species in front of Sepal.Length
dt %>% relocate("Species", .before = "Sepal.Length")
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 setosa 5.1 3.5 1.4 0.2
## 2 setosa 4.9 3.0 1.4 0.2
## 3 setosa 4.7 3.2 1.3 0.2
## 4 setosa 4.6 3.1 1.5 0.2
## 5 setosa 5.0 3.6 1.4 0.2
# relocate() also accepts tidy-select helpers:
# dt %>% relocate(where(is.numeric), .after = where(is.factor))
library(devtools)
# install_github("mrdwab/SOfun", force=TRUE)
library(SOfun)
v <- letters[1:7]
# SOfun::moveMe reorders a vector from a plain-English instruction
# string ("a last; b, e, g before d; c first; g after b")
v %>% moveMe(., "a last; b, e, g before d; c first; g after b")
## [1] "c" "b" "g" "e" "d" "f" "a"
# Build demo data where one student/assignment combination silently
# disappears: the NA mark is filtered out, hiding that student A has
# no A1 mark at all.
# NOTE(review): runif() is used without set.seed(), so the marks shown
# in the rendered output differ on every run.
hidden_na_dt <- data.frame(
"student" = rep(c("A", "B", "C"),2),
"assignment" = rep(c("A1", "A2"),3),
"mark" = c(NA, runif(n = 5, min = 45, max = 100))
) %>%
filter(!is.na(mark))
hidden_na_dt
## student assignment mark
## 1 B A2 81.99622
## 2 C A1 61.86270
## 3 A A2 76.38823
## 4 B A1 87.38820
## 5 C A2 83.44546
# tidyr::complete restores every student x assignment combination,
# filling the re-exposed missing mark with 0
hidden_na_dt %>%
complete(student, nesting(assignment), fill = list(mark = 0))
## # A tibble: 6 x 3
## student assignment mark
## <chr> <chr> <dbl>
## 1 A A1 0
## 2 A A2 76.4
## 3 B A1 87.4
## 4 B A2 82.0
## 5 C A1 61.9
## 6 C A2 83.4
I believe ggplot2 / plotly is relatively popular in practice. I also recommend highcharter to visualize time-series data and/or visNetwork / igraph / ggraph to visualize networks. My focus today is labeling inside a chart, so I will use ggplot2 to demonstrate.
# Baseline line chart: population over time for four countries,
# identified only via the colour legend (improved upon below)
plt_original <- population %>%
filter(country %in% c("India", "United States of America", "Viet Nam",
"Lao People's Democratic Republic")) %>%
ggplot(aes(x = year, y = population, group = country, color = country))+
geom_line()
plt_original
The purpose of having customized functions is to improve readability and reduce cognitive load for digesting information provided by visualization.
# Format a single number with an SI-style suffix and one (truncated,
# not rounded) decimal place: 1234 -> "1.2K", 2.5e9 -> "2.5B",
# 1.5e12 -> "1.5T". Values with absolute value below 1000 are returned
# unchanged (as numeric, not character); NA stays NA.
# Assumes whole-number input: a decimal point in the formatted value
# would confuse the digit slicing — TODO confirm callers only pass
# counts (here: populations).
#
# Fixes vs the first version:
#  * the 1e12 branch was not chained with `else`, so trillions fell
#    through to the 1e9 branch and came out as e.g. "1500.0B";
#  * small negative numbers (-999..-1) were returned as their absolute
#    value because the early return skipped the sign.
si_num <- function(x) {
  if (is.na(x)) {
    return(NA)
  }
  sign <- if (x < 0) "-" else ""
  x <- abs(x)
  # Pick suffix and digit count to drop, or bail out for |x| < 1000
  # (restoring the sign on the way out).
  if (x >= 1e12) {
    drop <- 12; suffix <- "T"
  } else if (x >= 1e9) {
    drop <- 9; suffix <- "B"
  } else if (x >= 1e6) {
    drop <- 6; suffix <- "M"
  } else if (x >= 1e3) {
    drop <- 3; suffix <- "K"
  } else {
    return(if (sign == "-") -x else x)
  }
  # format(..., scientific = 12) forces fixed (non-scientific) notation;
  # everything but the last `drop` digits is the integer part, the next
  # digit becomes the single truncated decimal.
  chrs <- strsplit(format(x, scientific = 12), split = "")[[1]]
  n <- length(chrs)
  out <- c(chrs[seq_len(n - drop)], ".", chrs[n - drop + 1], suffix)
  paste0(sign, paste(out, collapse = ""))
}
# Vectorised front-end for si_num(): format each element of `x`
# (sapply simplifies the per-element results back into a vector).
si_vec <- function(x) {
  sapply(x, si_num)
}
library(hrbrthemes)
library(scales)
library(ggrepel)
library(cowplot)
# Evenly spaced x-axis breaks every 4 years, anchored (via the modulo
# remainder) so the final break lands exactly on the last year.
year_series <- unique(population$year)
remainder <- (max(year_series) - min(year_series)) %% 4
new_breaks <- seq(from = min(year_series) + remainder, to = max(year_series), by = 4)
# Subset to the four countries of interest
df <- population %>%
filter(country %in% c("India", "United States of America", "Viet Nam",
"Lao People's Democratic Republic"))
# Last observation per country — used below to position the line labels
df_end <- df %>%
group_by(country) %>%
filter(year == max(year)) %>%
ungroup()
# Re-worked chart: label each line directly at its end point with
# ggrepel (so the legend can be dropped) and humanise the y axis
# with si_vec(); x limit extended to 2020 to leave room for labels
plt_adjust <- df %>%
ggplot(aes(x = year, y = population, group = country, color = country))+
geom_line()+
geom_point()+
geom_text_repel(
data = df_end,
aes(label = str_wrap(country,25)),
nudge_x = 1,
direction = "y",## nudge vertically
size = 3,
hjust = 0, ### left aligned
segment.size = 0.3, ### from here is about the line to connect the data point and text
min.segment.length = 0,
segment.color = "grey60") +
theme_ipsum() +
theme(legend.position = "none") +
scale_y_continuous(labels = si_vec)+
scale_x_continuous(breaks = new_breaks, limits = c(NA, 2020))+
labs(x = "Year", y = "Population", title = "Population Growth between 1995 and 2013")
plt_original
plt_adjust
library(plotly)
# Build a hover-text column and map it to the `text` aesthetic
# (ignored by ggplot2, but picked up by ggplotly via tooltip = "text")
plt_plotly <- df %>%
mutate(text = str_c("Country: ", country, "\n",
"Year: ", year, "\n",
"Population: ", si_vec(population))) %>%
ggplot(aes(x = year, y = population, group = country, color = country, text = text))+
geom_line()+
geom_point()+
theme_ipsum() +
theme(legend.position = "none") +
scale_y_continuous(labels = si_vec)+
scale_x_continuous(breaks = new_breaks)+
labs(x = "Year", y = "Population", title = "Population Growth between 1995 and 2013")
ggplotly({plt_plotly}, tooltip = "text")
To be continued — I have coded many interactive plots in Shiny apps, and some can be found at https://coffeeandplot.com/apps/. This is a relatively new website we created a couple of months ago. Get in touch if you have any suggestions. Please find me at https://www.linkedin.com/in/ytsong/.
This section describes how to render an RMarkdown report within a simple R conda environment on a Command Line Interface (cluster or linux environment). This could be achieved in two possible ways:
an environment.yml file that documents the package dependencies. Both work, but the second way is the recommended one, which will be described below.
Create an environment.yml file, that looks something like
#name of the conda environment
name: HowRYou
#the paths that conda takes a look for packages. Avoid using anaconda channel as we have
#experienced issues using it
channels:
- conda-forge
- bioconda
- defaults
#install following packages in the conda environment
#change according to the packages you are using in your RMarkdown file.
#The first three are required (they are R essentials). You can also change the versions to
# meet the requirements of your analysis
dependencies:
- r-base=3.4.1
- pandoc=1.19
- r-rmarkdown=1.6
- r-here
Create a conda environment (in this case HowRYou is the conda environment name specified in the environment.yml file). The -p flag should point to your miniconda installation path. To find out how to install conda, check this
conda env create -p /path/to/miniconda/envs/HowRYou --file environment.yml
Activate this conda environment
conda activate HowRYou
Run the RMarkdown file
Rscript -e "rmarkdown::render('HowRYou.Rmd')"
To pass arguments to the Rmd script (in this case two arguments - an input directory location and name of the input vcf file)
Rscript -e "rmarkdown::render('HowRYou.Rmd', params = list(directory = './data', file = 'dummy.txt'))"
An example of a rendered script used in the above step # 4
# install.packages(c("tidyverse", "patchwork", "palmerpenguins", "devtools"))
# devtools::install_github("bahlolab/ggwehi")
library(tidyverse)
library(patchwork)
# library(ggwehi)
library(palmerpenguins)
# Scatter plot: flipper length vs body mass, coloured by species
plot_1 <- penguins %>%
ggplot(aes(x = flipper_length_mm, y = body_mass_g, color = species)) +
geom_point() +
# scale_color_wehi() +
labs(x = "Flipper length (mm)",
y = "Body mass (g)",
color = "Penguin species") +
theme_minimal()
# Overlaid histograms of flipper length, one fill colour per species
# (position = "identity" overlays the bars instead of stacking them)
plot_2 <- penguins %>%
ggplot(aes(x = flipper_length_mm, fill = species)) +
geom_histogram(alpha = 0.5, position = "identity") +
# scale_fill_wehi() +
labs(x = "Flipper length (mm)",
y = "Frequency",
fill = "Penguin species") +
theme_minimal()
# patchwork: `+` places plots side by side
plot_1 + plot_2 + # plot them side-by-side
plot_layout(guides = "collect") + # this "collects" the legends to the right of the plot
plot_annotation(tag_levels = "a", tag_suffix = ")") # this adds panel labels
# patchwork: `/` stacks plots vertically instead
plot_1 / plot_2 + # plot them one over the other
plot_layout(guides = "collect") + # this "collects" the legends to the right of the plot
plot_annotation(tag_levels = "a", tag_suffix = ")") # this adds panel labels
# install.packages(c("wordcloud", "RColorBrewer"))
library(wordcloud) # wordcloud for generating word cloud images
library(RColorBrewer) # RColorBrewer for colour palettes
words <- c("RLadies", "Rmarkdown", "tips", "tricks", "script", "rmd", "console", "packages", "share", "4thanniversary", "celebrate",
"RSoftware", "Australia", "Melbourne", "Girls", "Learn","Teach", "Data Structure", "Algorithm", "Visualisation",
"Code", "Data", "ggplot2", "Zoom", "Help", "Text", "RStudio", "programing", "questions", "answers", "Plot", "happy")
freqs <- c(980, 900, 498, 811, 800, 654, 489, 90, 254, 500, 600, 200, 488, 400, 140, 250, 357, 789, 147, 120, 590, 741, 100, 788, 812, 693, 410, 753, 95, 80, 594, 644)
# Seed the RNG so the word layout/colours are reproducible
set.seed(3)
# wordcloud()'s second formal argument is `freq`; the original call
# passed `freqs =`, which is not a valid (partial) match, falls into
# `...`, and leaves `freq` missing — causing an error.
wordcloud(words = words, freq = freqs, scale=c(4,.4), max.words = 1000, random.order=TRUE, random.color=TRUE, colors = brewer.pal(8, "Accent"))
In a code chunk or in an R script, insert a timestamp with ts + shift + tab.
First write ts:
ts
## function (data = NA, start = 1, end = numeric(), frequency = 1,
## deltat = 1, ts.eps = getOption("ts.eps"), class = if (nseries >
## 1) c("mts", "ts", "matrix") else "ts", names = if (!is.null(dimnames(data))) colnames(data) else paste("Series",
## seq(nseries)))
## {
## if (is.data.frame(data))
## data <- data.matrix(data)
## if (is.matrix(data)) {
## nseries <- ncol(data)
## ndata <- nrow(data)
## dimnames(data) <- list(NULL, names)
## }
## else {
## nseries <- 1
## ndata <- length(data)
## }
## if (ndata == 0)
## stop("'ts' object must have one or more observations")
## if (missing(frequency))
## frequency <- 1/deltat
## else if (missing(deltat))
## deltat <- 1/frequency
## if (frequency > 1 && 0 < (d <- abs(frequency - round(frequency))) &&
## d < ts.eps)
## frequency <- round(frequency)
## if (!missing(start))
## start <- as.numeric(start)
## if (length(start) > 1L) {
## start <- start[1L] + (start[2L] - 1)/frequency
## }
## if (!missing(end))
## end <- as.numeric(end)
## if (length(end) > 1L) {
## end <- end[1L] + (end[2L] - 1)/frequency
## }
## if (missing(end))
## end <- start + (ndata - 1)/frequency
## else if (missing(start))
## start <- end - (ndata - 1)/frequency
## if (start > end)
## stop("'start' cannot be after 'end'")
## cycles <- as.numeric((end - start) * frequency)
## if (abs(round(cycles) - cycles) > ts.eps * max(cycles, 1))
## stop("'end' must be a whole number of cycles after 'start'")
## nobs <- floor(cycles + 1.01)
## if (nobs != ndata)
## data <- if (NCOL(data) == 1) {
## if (ndata < nobs)
## rep_len(data, nobs)
## else if (ndata > nobs)
## data[1L:nobs]
## }
## else {
## if (ndata < nobs)
## data[rep_len(1L:ndata, nobs), ]
## else if (ndata > nobs)
## data[1L:nobs, ]
## }
## attr(data, "tsp") <- c(start, end, frequency)
## if (!is.null(class) && class[[1]] != "none")
## attr(data, "class") <- class
## data
## }
## <bytecode: 0x7fbf0f17b0e8>
## <environment: namespace:stats>
Then press shift + tab:
# Wed Nov 11 10:42:54 2020 ------------------------------
ggplot2!
ggplot(ChickWeight, aes(Time, weight, group = Chick)) +
geom_line(aes(color = Diet)) +
facet_wrap(~Diet) +
theme_bw() +
labs(y = "Weight") +
scale_color_viridis_d()
ggplot2. Replace the data with the variable you are faceting with dropped, and change the color to gray, as in the second line of the code below.
ggplot(ChickWeight, aes(Time, weight, group = Chick)) +
geom_line(data = select(ChickWeight, -Diet), color = "gray") +
geom_line(aes(color = Diet)) +
facet_wrap(~Diet) +
theme_bw() +
labs(y = "Weight") +
scale_color_viridis_d()
Below are some tips and tricks that I have picked up along my R journey, that may (or may not!) be useful for those working with sport science data, or any data, in R. I hope you find the tips useful!
If you are interested in working with some real life netball data, you can see a slidedeck that I put together of how to import, tidy and analyse the data here.
Sometimes, someone will share some data with us via Dropbox. Often these files can be really big and it is annoying to download a local copy of these files on your machine. If you have been sent a link to view these files, the following may be useful so you can load them directly into R, without having to save the file locally on your machine first.
# Install the {vroom} package — the package name must be a quoted
# string; the original bare `install.packages(vroom)` fails with
# "object 'vroom' not found"
install.packages("vroom")
# Load the package from your library
library(vroom)
# Load your data directly into R!
# (?dl=1 in a Dropbox share link forces a direct file download)
data <- vroom("https://www.dropbox.com/s/sometexthere/NameOfFile.csv?dl=1")
Use R to quickly import your AMS data without even opening a web browser! To do this, please follow the steps below. Note – I am using Smartabase as an example here because it is familiar to me.
# Load required packages
library(rvest)
library(plyr)
library(dplyr)
library(qdap)
# Connect to a report that you have generated via Smartabase.
# NOTE(review): credentials are embedded in the URL in plain text —
# prefer keeping them out of scripts (e.g. environment variables).
WellnessDataURL <- html_session("https://username:password@my2.smartabase.com/yourteamsname/live?report=WellnessData&updategroup=true")
# Read in data
WellnessData <- read_html(WellnessDataURL)
# Identify the table nodes in the page
WellnessDataTable <- html_nodes(WellnessData, "table")
# Collect only the first table's data
WellnessDataTable1 <- html_table(WellnessDataTable[1], fill = TRUE)
# Make data.frame
HistoricalWellnessData <- as.data.frame(WellnessDataTable1)
# Clean environment: remove every object except the final data.frame
rm(list = grep("^HistoricalWellnessData", ls(), value = TRUE, invert = TRUE))
Now your AMS data is in a neat data.frame and ready for any further statistical analysis or visualisation using R, without needing to open a web browser!
![
In Melbourne we have been in a strict lockdown since July. Each week we get our hopes up that restrictions might be eased, and once again these hopes are dashed by the announcement Sunday Oct 25, keeping the restrictions a little longer because of another outbreak in the northwest of the city. The data we have collected here are the case counts by Victorian local government area (LGA) since the beginning of July. We will examine the spatiotemporal distribution of these counts.
Working with spatial data is always painful! It almost always requires some ugly code. Part of the reason for the difficulty is the use of special data objects that describe maps. There are several different choices, and some packages and tools use one, and others use another, so not all tools work together. The sf package is a recent endeavour that helps enormously, but some tools still use other forms, and when you run into errors this might be the reason - it can be hard to tell. Another reason is that map objects can be very large, which makes sense for accurate mapping, but for data analysis and visualisation, we'd rather have smaller, even if slightly inaccurate, spatial objects. It can be helpful to thin out map data before doing further analysis - you need special tools for this, eg mapshaper. We don't need this for the exercises here. Another problem commonly encountered is that there are numerous coordinate systems, and types of projections of the 3D globe onto a 2D canvas. We have become accustomed to lat/long but, like time, it's an awkward scale to compute on because a translation from E/W and N/S to positive and negative values is needed. More commonly a Universal Transverse Mercator (UTM) is the standard, but it's far less intuitive to use.
The code for all the analysis is provided for you. We recommend that you run the code in steps to see what it is doing, why the mutating and text manipulations are necessary. Talk about the code with each other to help you understand it.
COVID-19 data by LGA is available from https://www.covid19data.com.au/victoria. You should find that some variables are type chr because “null” has been used to code entries on some days. This needs fixing, and also missings should be converted to 0, with the rationale being that if the value is missing it most likely means there were no cases.
# Read the raw case counts and tidy them up:
#  * "null" entries (present in three LGA columns) become 0 and the
#    columns are converted to numeric
#  * wide -> long so the counts can later be joined onto map polygons
#  * Date strings get the year prepended and are parsed as real dates
#  * remaining NAs are set to 0 — a reasonable assumption (missing
#    most likely means no cases recorded)
covid <- read_csv("https://raw.githubusercontent.com/numbats/eda/master/data/melb_lga_covid.csv") %>%
  mutate(across(c(Buloke, Hindmarsh, Towong),
                ~ as.numeric(ifelse(.x == "null", "0", .x)))) %>%
  pivot_longer(cols = Alpine:Yarriambiack, names_to = "NAME", values_to = "cases") %>%
  mutate(Date = ydm(paste0("2020/", Date))) %>%
  mutate(cases = replace_na(cases, 0))
# Case counts are cumulative, so take lags to get daily case counts
# (the first day within each LGA has no lag and becomes NA)
covid <- covid %>%
group_by(NAME) %>%
mutate(new_cases = cases - dplyr::lag(cases))
# Filter to the final day, which holds the cumulative count
covid_cumul <- covid %>%
filter(Date == max(Date))
Now let’s get polygon data of Victorian LGAs from the ozmaps package. We need to fix some names of LGAs because there are duplicated LGA names, and there is one mismatch in names from the COVID data and the ozmaps data (Colac Otway). If the COVID data had been provided with a unique LGA code it would have helped in merging with the polygon data.
# Read the LGA data from ozmaps package.
# This has LGAs for all of Australia.
# Need to filter out Victoria LGAs, avoiding LGAs
# from other states with same name, and make the names
# match covid data names. The regex below strips the
# " (state)" / " (LGA type)" suffixes from NAME.
# Good reference: https://r-spatial.github.io/sf/articles/sf1.html
data("abs_lga")
vic_lga <- abs_lga %>%
# Rename interstate duplicates FIRST, so the suffix-stripping regex
# cannot collapse them onto Victorian LGAs with the same base name
mutate(NAME = ifelse(NAME == "Latrobe (M) (Tas.)", "LatrobeM", NAME)) %>%
mutate(NAME = ifelse(NAME == "Kingston (DC) (SA)", "KingstonSA", NAME)) %>%
mutate(NAME = ifelse(NAME == "Bayside (A)", "BaysideA", NAME)) %>%
mutate(NAME = str_replace(NAME, " \\(.+\\)", "")) %>%
mutate(NAME = ifelse(NAME == "Colac-Otway", "Colac Otway", NAME))
vic_lga <- st_transform(vic_lga, 3395)
# 3395 is EPSG CRS, equiv to WGS84 mercator,
# see https://spatialreference.org/ref/epsg/?page=28
# cartogram() needs this to be set
A choropleth map is made from filling the colour of polygons.
# Join covid data to polygon data, remove LGAs with
# missing values, which should leave just the Victorian LGAs
vic_lga_covid <- vic_lga %>%
left_join(covid_cumul, by="NAME") %>%
filter(!is.na(cases))
# Make choropleth map, with an appropriate sequential colour palette.
# The `label` aesthetic is unused by geom_sf itself but provides
# tooltips if the plot is passed to ggplotly() (commented out below).
choropleth <- ggplot(vic_lga_covid) +
geom_sf(aes(fill = cases, label=NAME), colour="white") +
scale_fill_distiller("Cases", palette = "YlOrRd",
direction=1) +
theme_map() +
theme(legend.position="bottom")
choropleth
#ggplotly(choropleth) # Interactive map
The file VIF2019_Population_Service_Ages_LGA_2036.xlsx has been extracted from the Vic Gov web site. It is a complicated xlsx file, with the data in sheet 3, and starting 13 rows down. The readxl package is handy here to extract the population data needed. The code below has extracted only the data needed.
# LGA population estimates, hand-extracted from
# VIF2019_Population_Service_Ages_LGA_2036.xlsx (sheet 3, row 13 on)
pop <- tibble(NAME = c("Alpine","Ararat","Ballarat","Banyule","Bass Coast","Baw Baw","Bayside","Benalla","Boroondara","Brimbank","Buloke","Campaspe","Cardinia","Casey","Central Goldfields","Colac Otway","Corangamite","Darebin","East Gippsland","Frankston","Gannawarra","Glen Eira","Glenelg","Golden Plains","Greater Bendigo","Greater Dandenong","Greater Geelong","Greater Shepparton","Hepburn","Hindmarsh","Hobsons Bay","Horsham","Hume","Indigo","Kingston","Knox","Latrobe","Loddon","Macedon Ranges","Manningham","Mansfield","Maribyrnong","Maroondah","Melbourne","Melton","Mildura","Mitchell","Moira","Monash","Moonee Valley","Moorabool","Moreland","Mornington Peninsula","Mount Alexander","Moyne","Murrindindi","Nillumbik","Northern Grampians","Port Phillip","Pyrenees","Queenscliffe","South Gippsland","Southern Grampians","Stonnington","Strathbogie","Surf Coast","Swan Hill","Towong","Wangaratta","Warrnambool","Wellington","West Wimmera","Whitehorse","Whittlesea","Wodonga","Wyndham","Yarra","Yarra Ranges","Yarriambiack"),
pop = c(12578,11746.43,103500.3,127447,33464.85,49296.21,102912,13981.3,177276,204190,6284,37596.09,97572.66,312789,13085.32,21362.81,16241,155126,45598.55,139502,10567.15,148583,19758.61,22016,112270.9,160222,239529.9,65071.32,15526.87,5787.223,93445.04,19884.51,207038,16165.73,158937.6,160353.5,74622.36,7559.041,47479.75,122570.7,8674.158,86942,114799.3,146097.1,141422.1,54658,41794.85,29486,192625,122871.1,32668.76,172289.5,161528,19093.7,16738.47,14052.73,64174,11570.29,108627.2,7315.398,2927.166,29120.95,16122.74,111003,10357.01,30465.01,20895.86,6045.765,28592,34243.1,43531.44,3932.907,169641.6,207058,40100,227008,92898.52,155227.4,6742.772))
vic_lga_covid <- vic_lga_covid %>%
left_join(pop, by="NAME")
# Compute cases per 10,000 residents.
# NOTE(review): with one row per NAME group, max(x, 0) is just x here;
# pmax() was probably intended — confirm if groups can have >1 row.
vic_lga_covid <- vic_lga_covid %>%
group_by(NAME) %>%
mutate(cases_per10k = max(cases/pop*10000, 0)) %>%
ungroup()
# Distort the polygons so each LGA's area is proportional to its
# population ("pop" column), keeping geographic adjacency
vic_lga_covid_carto <- cartogram_cont(vic_lga_covid, "pop")
# This st_cast() is necessary to get plotly to work
vic_lga_covid_carto <- st_cast(vic_lga_covid_carto, "MULTIPOLYGON")
# Same styling as the choropleth, but filled by cases per 10k
cartgram <- ggplot(vic_lga_covid_carto) +
geom_sf(aes(fill = cases_per10k, label=NAME), colour="white") +
scale_fill_distiller("Cases", palette = "YlOrRd",
direction=1) +
theme_map() +
theme(legend.position="bottom")
cartgram
#ggplotly(cartgram) # Interactive cartogram
The hexagon tile map makes tiled hexagons representing the LGAs. You can read more about it in the documentation for the sugarbag package at https://srkobakian.github.io/sugarbag/.
# Spatial coordinates need to be in long/lat for create_hexmap()
vlc_latlong <- st_transform(vic_lga_covid, crs = "+proj=longlat +datum=WGS84")
# Placement of hexmaps depends on position relative to
# Melbourne central
data(capital_cities)
vic_lga_hexmap <- create_hexmap(
shp = vlc_latlong,
sf_id = "NAME",
focal_points = capital_cities, verbose = TRUE)
# This shows the centroids of the hexagons
# ggplot(vic_lga_hexmap, aes(x=hex_long, y=hex_lat)) +
# geom_point()
# Hexagons are made with the `fortify_hexagon` function; the covid
# counts and population are then joined back on to compute cases per
# 10k residents (same max-vs-pmax caveat as earlier — each NAME group
# appears to hold a single row here).
vic_lga_covid_hexmap <- vic_lga_hexmap %>%
fortify_hexagon(sf_id = "NAME", hex_size = 0.1869) %>%
left_join(covid_cumul, by="NAME") %>%
filter(!is.na(cases)) %>%
left_join(pop, by="NAME") %>%
group_by(NAME) %>%
mutate(cases_per10k = max(cases/pop*10000, 0)) %>%
ungroup()
# Hexagon tile map: grey base map of the real LGA shapes, with the
# hexagons drawn on top, filled/outlined by cases per 10k residents
hexmap <- ggplot() +
geom_sf(data=vlc_latlong,
fill = "grey90", colour = "white", size=0.1) +
geom_polygon(data=vic_lga_covid_hexmap,
aes(x=long, y=lat, group=hex_id,
fill = cases_per10k,
colour = cases_per10k,
label=NAME),
size=0.2) +
scale_fill_distiller("Cases", palette = "YlOrRd",
direction=1) +
scale_colour_distiller("Cases", palette = "YlOrRd",
direction=1) +
theme_map() +
theme(legend.position="bottom", aspect.ratio=0.7)
hexmap
# ggplotly(hexmap)
A work by R-Ladies Melbourne